In [1]:
import seaborn as sns
import pandas as pd
import numpy as np
import plotly.express as px
%pylab inline

import sklearn as sk
import sklearn.tree as tree
from IPython.display import Image  
import pydotplus
import us
from sklearn.datasets import load_iris
Populating the interactive namespace from numpy and matplotlib
In [2]:
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import warnings
warnings.filterwarnings("ignore")
In [3]:
df=pd.read_csv('nhtsa_survey_data.csv')
In [4]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

Data cleaning

In [5]:
df.rename(columns={'Unnamed: 0':'Index','number_of_vehicle_forms_submitted_all':'Total_vehicle_forms',\
                    'number_of_motor_vehicles_in_transport_mvit':'MV_transport',\
                    'number_of_parked_working_vehicles':'Parked_vehicles',\
                    'number_of_forms_submitted_for_persons_not_in_motor_vehicles':'Total_Persons_notin_MV',\
                    'number_of_persons_not_in_motor_vehicles_in_transport_mvit':'Total_Persons_notin_MVIT',\
                    'number_of_persons_in_motor_vehicles_in_transport_mvit':'Total_Persons_in_MVIT',\
                    'number_of_forms_submitted_for_persons_in_motor_vehicles':'Total_Persons_in_MV'}, inplace=True)
In [6]:
df.set_index('Index', inplace=True)
In [7]:
df.replace({'day_of_week':{1:'Sunday',2:'Monday',\
                           3:'Tuesday',4:'Wednesday',\
                           5:'Thursday',6:'Friday',7:'Saturday'}}, inplace=True)
In [8]:
df.replace({'month_of_crash':{1:'Jan',2:'Feb',3:'Mar',\
                              4:'Apr',5:'May',6:'June',\
                              7:'July',8:'Aug',9:'Sep',\
                              10:'Oct',11:'Nov',12:'Dec'}}, inplace=True)
In [9]:
df.drop(columns=["atmospheric_conditions_2_name", "atmospheric_conditions_name"], inplace=True)
In [10]:
list_col = df.iloc[:,19:36].columns.tolist()
In [11]:
list_col = ["route_name", "trafficway", "latitude", "longitude",\
           "special_juris", "first_crash_event", "Manner_collision",\
           "interchange_area", "junction_location","intersection_type", \
           "work_zone", "relation_trafficway", "light_condition",\
           "atmospheric_conditions", "School_bus_rel", "crash_factors",\
           "fatalities_num"]
In [12]:
df_ren = df.columns.tolist()
In [13]:
df_ren[19:36] = list_col
In [14]:
df.columns = df_ren
In [15]:
df.replace({'first_crash_event':{'Rollover/Overturn':'Rollover',\
                                'Fire/Explosion':'Explosion',\
                                 'Immersion (or Partial Immersion, Since 2012)':'Immersion',\
                                'Fell/Jumped from Vehicle':'fell_out',\
                                 'Injured in Vehicle (Non-Collision)':'Injured_Non_Col',\
                                'Other Non-Collision':'Other_Non_col',\
                                 'Motor Vehicle in Transport':'MV_transport',\
                                 'Parked Motor Vehicle (Not In Transport)':'Parked_Vehicle',\
                                 'Other Object (Not Fixed)':'Object_NFix',\
                                 'Impact Attenuator/Crash Cushion':'Crash_Cushion',\
                                 'Bridge Pier or Support':'Bridge_pier',\
                                 'Bridge Rail (Includes Parapet)':'Bridge_railling',\
                                 'Concrete Traffic Barrier':'Traffic_barrier',\
                                 'Other Traffic Barrier':'Traffic_barrier',\
                                 'Traffic Sign Support':'Traffic_barrier',\
                                 'Utility Pole/Light Support':'Pole',\
                                 'Other Post, Other Pole, or Other Support':'Pole',\
                                 'Tree (Standing Only)':'Tree','Other Fixed Object':'Fixed_obj',\
                                 'Pavement Surface Irregularity (Ruts, Potholes, Grates, etc.)':'Pavement_issue',\
                                 'Working Motor Vehicle':'Working_MV','Guardrail Face':'Gaurdrail',\
                                 'Guardrail End':'Gaurdrail',\
                                 'Jackknife (Harmful to This Vehicle)':'Jacknife',\
                                 'Motor Vehicle In-Transport Strikes or is Struck by Cargo':'MV_Moving',\
                                 'Persons or\nObjects Set-in-Motion from/by Another Motor Vehicle In-Transport':\
                                 'MV_Moving','Cable Barrier (Since 2008)':'Cable_barrier',\
                                 'Cargo/Equipment Loss or Shift (Harmful to This Vehicle)':'Cargo_loss',\
                                 'Object Fell From Motor Vehicle In-Transport (Since 2013)':'Obj_Fell',\
                                 'Thrown or Falling Object':'Obj_Fell',\
                                 'Ridden Animal or Animal-Drawn Conveyance (Since 1998)':'Animal_Ridding',\
                                 'Non-Motorist on Personal Conveyance':'NM_Conveyance'}}, inplace=True)                          
In [16]:
df.Manner_collision.\
    replace(to_replace="Not Collision with Motor Vehicle in Transport (Not Necessarily in Transport for\n2005-2009)",\
                                     value="No_collision_MV", inplace=True)
In [17]:
df.rename(columns={'died_at_scene_en_route': 'death'},inplace=True)
In [18]:
df.rename(columns={ 'police_reported_drug_involvement':'drug',\
                               'police_reported_alcohol_involvement':'alcohol',\
                                'air_bag_deployed_name':'airbag',\
                               'indication_of_misuse_of_restraint_system_helmet':'misuse_restraint',\
                               'restraint_system_helmet_use_name':'restraint',\
                               'number_of_fatalities':'fatalities_num',
                               'number_of_drunk_drivers': 'drunk_drivers_num',
                               'driver_distracted_by_name': 'driver_distracted',
                               'driver_maneuvered_to_avoid_name': 'driver_maneuvered',
                               'injury_severity_name': 'injury_severity',
                               'person_type_name': 'person_type',
                               'violations_charged_name': 'violations'}, inplace = True)
In [19]:
df.replace({'death':{'Died at Scene':'At Scene',\
                           'Died En Route':'En Route',\
                                 'Not Applicable':'No'}}, inplace=True)
In [20]:
df.replace({'drug':{'Unknown (Police Reported)':'Unknown',\
                           'No (Drugs Not Involved)':'No',\
                                 'Yes (Drugs Involved)':'Yes'}}, inplace=True)
In [21]:
df.replace({'alcohol':{'Unknown (Police Reported)':'Unknown',\
                           'No (Alcohol Not Involved)':'No',\
                                 'Yes (Alcohol Involved)':'Yes'}}, inplace=True)
In [22]:
df.replace({'airbag':{'Not Deployed':'No',\
                        'Deployment Unknown':'Unknown'}},inplace=True)
In [23]:
df['air_bag'] = df.airbag.apply(lambda x: 'Deployed' if 'Deployed:' in x else x)
In [24]:
df.air_bag.replace(to_replace='No',value='Not Deployed', inplace=True)
In [25]:
df.drop(['airbag'], axis=1,inplace=True)
In [26]:
df.replace({'restraint':{'None Used':'None',\
                           'Unknown if Helmet Worn':'Unknown'}}, inplace=True)
In [27]:
df['maneuvered'] = df.driver_maneuvered.apply(lambda x: 'Motor Vehicle' if 'Motor Vehicle' in x else x)
In [28]:
df.drop(['driver_maneuvered'], axis=1, inplace=True)
In [29]:
df.replace({'injury_severity':{'Fatal Injury (K)':'Death',\
                        'Suspected Serious Injury (A)':'Suspected Serious',
                            'Suspected Minor Injury (B)':'Suspected Minor',
                            'Injured, Severity Unknown (U) (Since 1978)':'Injured, Severity Unknown',
                           'Possible Injury (C)':'Possible Injury',
                            'No Apparent Injury (O)':'No Apparent Injury'}},inplace=True)
In [30]:
df["violation_reduce"] = df.violations.\
                apply(lambda x: "RECKLESS/CARELESS/HIT-AND-RUN" if "RECKLESS/CARELESS/HIT-AND-RUN" in x\
                         else "IMPAIRMENT OFFENSES" if "IMPAIRMENT OFFENSES" in x\
                         else "RULES OF THE ROAD" if "RULES OF THE ROAD" in x\
                         else "NON-MOVING_LICENSE" if "LICENSE" in x\
                         else "SPEEDING" if "SPEED-RELATED" in x\
                         else "EQUIPMENT" if "EQUIPMENT" in x else "None")
In [31]:
df.replace({'Manner_collision':{'No_collision_MV':'Single_vehicle','Angle':'Multi_vehicle',\
                'Front-to-Front':'Multi_vehicle',\
                'Front-to-Rear':'Multi_vehicle','Sideswipe – Same Direction':'Multi_vehicle',\
                'Sideswipe – Opposite Direction':'Multi_vehicle' ,\
                'Other (End-Swipes and Others)':'Multi_vehicle',\
                'Rear-to-Side':'Multi_vehicle','Rear-to-Rear':'Multi_vehicle', \
                'Not Reported':'Unknown'}}, inplace=True)
In [32]:
df.replace({'injury_severity':{'No Apparent Injury':'Not_injured','Suspected Minor':'Injured',\
            'Suspected Serious':'Injured','Possible Injury':'Injured','Injured, Severity Unknown':'Injured',\
            'Died Prior to Crash':'Death'}}, inplace=True)
In [33]:
df.replace({'restraint':{'No Helmet':'None', 'Not Reported':'Unknown'}}, inplace=True)
df['restraint_binary']=df.restraint.apply(lambda x : "Restraint Used" if 'Belt' in x or 'Restraint' in x\
                                           or 'Helmet' in x or 'Other' in x \
                                     else 'Unknown' if 'Unknown' in x \
                                        else "Restraint not used")
In [34]:
df.replace({'air_bag':{'Switched Off':'Not Deployed', \
                        'Not Reported':'Unknown', \
                       'Not Applicable':'Deployed'}}, inplace=True)
In [35]:
df.replace({'first_crash_event':{'MV_transport':'Collision_with_moving_obj',\
            'Pedestrian':'Collision_with_moving_obj', \
            'Rollover':'Non_collision','Tree':'Collision_with_fix_obj',\
            'Curb':'Collision_with_fix_obj', \
            'Gaurdrail':'Collision_with_fix_obj','Embankment':'Collision_with_fix_obj', \
            'Ditch':'Collision_with_fix_obj','Pole':'Collision_with_fix_obj',\
            'Traffic_barrier':'Collision_with_fix_obj',
            'Pedalcyclist':'Collision_with_moving_obj','Culvert':'Collision_with_fix_obj',\
            'Parked_Vehicle':'Collision_with_moving_obj', \
            'Fence':'Collision_with_fix_obj','fell_out':'Non_collision',\
            'Mail Box':'Collision_with_fix_obj', \
            'Fixed_obj':'Collision_with_fix_obj','Live Animal':'Collision_with_moving_obj',\
            'Object_NFix':'Collision_with_moving_obj', \
            'NM_Conveyance':'Collision_with_moving_obj','Bridge_railling':'Collision_with_fix_obj',\
            'Cable_barrier':'Collision_with_fix_obj',  \
            'Bridge_pier':'Collision_with_fix_obj','Railway Vehicle':'Collision_with_moving_obj',\
            'Wall':'Collision_with_fix_obj', \
            'Ground':'Collision_with_fix_obj','Boulder':'Collision_with_fix_obj',\
            'Crash_Cushion':'Collision_with_fix_obj', \
            'Immersion':'Non_collision','Traffic Signal Support':'Collision_with_fix_obj',\
            'Building':'Collision_with_fix_obj',  \
            'MV_Moving':'Collision_with_moving_obj','Shrubbery':'Collision_with_fix_obj',\
            'Other_Non_col':'Non_collision',  \
            'Cargo_loss':'Non_collision','Pavement_issue':'Collision_with_fix_obj',\
            'Fire Hydrant':'Collision_with_fix_obj',  \
            'Obj_Fell':'Collision_with_moving_obj','Snow Bank':'Collision_with_fix_obj',\
            'Jacknife':'Non_collision',  \
            'Working_MV':'Collision_with_moving_obj','Animal_Ridding':'Collision_with_moving_obj', \
            'Bridge Overhead Structure':'Collision_with_fix_obj','Injured_Non_Col':'Non_collision', \
            'Explosion':'Non_collision'}},inplace=True)
In [36]:
df.drop(columns=['Total_vehicle_forms','MV_transport', 'Parked_vehicles','Total_Persons_in_MV', \
                'Total_Persons_notin_MV','Total_Persons_notin_MVIT','Total_Persons_in_MVIT',\
                 'special_juris'], inplace=True)
In [37]:
df.drop(columns=['functional_system_name','ownership_name','route_name','trafficway',\
                 'interchange_area','junction_location',\
                 'intersection_type','work_zone','relation_trafficway',\
                'School_bus_rel','crash_factors','timestamp_of_crash','driver_distracted',\
                'person_number','person_type','misuse_restraint','maneuvered',\
                'county','city',\
                'violations','day_of_crash','month_of_crash','restraint','vehicle_number'], inplace=True)

First 5 lines of the dataset after data cleaning

In [38]:
df.head()
Out[38]:
state_name consecutive_number day_of_week hour_of_crash national_highway_system ... drug death air_bag violation_reduce restraint_binary
Index
0 Wyoming 560099 Sunday 3 1 ... Unknown At Scene Not Deployed None Restraint not used
1 Idaho 160226 Friday 16 1 ... No No Not Deployed None Restraint Used
2 Idaho 160226 Friday 16 1 ... Not Reported At Scene Not Deployed None Restraint not used
3 Idaho 160226 Friday 16 1 ... Not Reported No Not Deployed None Restraint not used
4 Arkansas 50006 Wednesday 3 1 ... Not Reported No Not Deployed RECKLESS/CARELESS/HIT-AND-RUN Restraint not used

5 rows × 23 columns

In [39]:
df.shape
Out[39]:
(84579, 23)

Dataset Description:

This dataset contains information on traffic accidents and fatalities collected from the Fatality Analysis Reporting System (FARS) and National Highway Traffic Safety Administration (NHTSA). It reports fatal crashes in 51 states of US in 2016. The cleaned dataset has 23 columns and 84579 rows.

Columns description:

  1. state_name - name of 51 states in US
  2. consecutive_number - an unique number that is assigned to the car involved in the accident
  3. day_of_week - the day in the week when the accident occured
  4. hour_of_crash - the hour of the day when the accident occured
  5. national_highway_system - if the accident occured in national highway system
  6. land_use_name - the type of land use where the accident occured
  7. latitude - the latitude of where the accident occured
  8. longitude - the longitude of where the accident occured
  9. first_crash_event - the first damage event that was produced by the crash
  10. Manner_collision - the way vehicle collided that led to the crash
  11. fatalities_num - number of fatalities involved in the accident
  12. drunk_drivers_num - number of drunk drivers involved in the accident
  13. age - age of the person involved in the accident
  14. sex - gender of the person involved in the accident
  15. injury_severity - the level of injury severity of the person right after the accident
  16. alcohol - if the accident is an alcohol-influenced accident
  17. drug - if the accident is a drug-influenced accident
  18. death - if the person was dead right after the accident
  19. air_bag - if the airbag was deployed in the accident
  20. violation_reduce - the type of the violation that the driver violated
  21. restraint_binary - if restraints were being used in the accident
  22. light_condition - the light condition when the accident occured
  23. atmospheric_conditions - the atmospheric condition when the accident occured

Goal 1:

Fatality distribution with respect to different types of violations like speeding, rule breaking, hit and run, etc. Distribution of violations based on location of accident, land use, atmospheric conditions, lighting.

Graph showing distribution of violations

In [40]:
df_vio = df[df.violation_reduce != 'None']
In [41]:
df_vio.groupby('violation_reduce')['violation_reduce'].size().sort_values(ascending=False)
Out[41]:
violation_reduce
RECKLESS/CARELESS/HIT-AND-RUN    5352
NON-MOVING_LICENSE               3919
IMPAIRMENT OFFENSES              2518
RULES OF THE ROAD                2494
SPEEDING                          771
EQUIPMENT                         504
Name: violation_reduce, dtype: int64
In [42]:
sns.countplot(y='violation_reduce', data=df_vio)
Out[42]:
<matplotlib.axes._subplots.AxesSubplot at 0x11ca17280>

Graph of Fatality distribution with respect to different types of violations

In [43]:
df_vio.groupby('violation_reduce')['fatalities_num'].mean().sort_values(ascending=False)
Out[43]:
violation_reduce
SPEEDING                         1.683528
RECKLESS/CARELESS/HIT-AND-RUN    1.250747
EQUIPMENT                        1.136905
IMPAIRMENT OFFENSES              1.122319
NON-MOVING_LICENSE               1.099260
RULES OF THE ROAD                1.072574
Name: fatalities_num, dtype: float64
In [44]:
sns.catplot(x='fatalities_num', data=df_vio, y='violation_reduce', aspect=2, kind='bar')
Out[44]:
<seaborn.axisgrid.FacetGrid at 0x120ed2ac0>

Observation: Speeding ranks the second lowest in frequency considering the type of violation for cars that involved in accident in year 2016. However, it led to the highest average number of fatalities among all other violations.

Graph of violations and fatality number by land use

In [45]:
df_land= df_vio[(df.land_use_name == 'Rural') | (df.land_use_name == 'Urban')]
In [46]:
sns.catplot(x='fatalities_num',y='violation_reduce',hue='land_use_name', data=df_land,kind='bar',aspect=3)
Out[46]:
<seaborn.axisgrid.FacetGrid at 0x121be0ac0>

Observation: Speeding in rural area caused higher fatality rate in urban area.

Graph showing if national highway system not or if accident accured in rural area due to speeding

In [47]:
speed_df = df[(df.violation_reduce == 'SPEEDING')]
In [48]:
vio_land_df = df[(df.violation_reduce == 'SPEEDING') & (df.land_use_name == 'Rural')]
In [49]:
vio_land_df.groupby('national_highway_system')['fatalities_num'].count().sort_values(ascending=False)
Out[49]:
national_highway_system
1    272
0    202
Name: fatalities_num, dtype: int64
In [50]:
sns.catplot(x='violation_reduce',hue='national_highway_system', \
            data=vio_land_df,kind='count',aspect=2)
Out[50]:
<seaborn.axisgrid.FacetGrid at 0x120ed2a00>

Observation: Speeding happened more often in national highway system than that of non-national highway.

Graph of violations and fatality number by light condition

In [51]:
df.rename(columns={'light_condition': 'light_condition1'},inplace=True)
In [52]:
df['light_condition'] = df.light_condition1.apply(lambda x: "Dark" if "Dark" in x else x)
In [53]:
df_light = df[((df.light_condition == 'Dark') | (df.light_condition == 'Daylight')|(df.light_condition == 'Dusk')|\
(df.light_condition == 'Dawn')) & (df.violation_reduce != 'None') ]
In [54]:
df.drop(['light_condition1'], axis=1, inplace=True)
In [55]:
sns.catplot(x='violation_reduce', y='fatalities_num',hue='light_condition', \
            data=df_light, kind='bar',aspect=2).set_xticklabels(rotation=90)
Out[55]:
<seaborn.axisgrid.FacetGrid at 0x1213ad8b0>

Observation: Accidents caused by speeding happened at daylight have higher fatality rate.

Graph of violations and fatality number by atmospheric_condition

In [56]:
df.rename(columns={'atmospheric_conditions': 'atmospheric_conditions1'},inplace=True)
In [57]:
df['atmospheric_conditions'] = df.atmospheric_conditions1.\
                apply(lambda x: "Snow" if "Snow" in x\
                         else "Rain" if "Rain" in x\
                         else x)
In [58]:
df_atom = df[((df.atmospheric_conditions != 'Unknown') & (df.atmospheric_conditions != 'Not Reported')) & \
            (df.violation_reduce != 'None')]
In [59]:
df.drop(['atmospheric_conditions1'], axis=1, inplace=True)
In [60]:
sns.catplot(x='violation_reduce', y='fatalities_num',hue='atmospheric_conditions', \
            data=df_atom,kind='bar',aspect=3)
Out[60]:
<seaborn.axisgrid.FacetGrid at 0x120e12cd0>

Observation: Accidents caused by speeding happened in rainy days have higher fatality rate.

Graph of average fatality number caused by speeding in different states

In [61]:
speed_sort = speed_df.groupby('state_name')['fatalities_num'].mean().sort_values(ascending=False).reset_index()
In [62]:
speed_sort['state_name_abbr']=speed_sort.state_name.apply(lambda x:us.states.lookup(x).abbr)


fig = px.choropleth(speed_sort, 
                    locations="state_name_abbr",
                    locationmode='USA-states',
                    color="fatalities_num", 
                    hover_name="state_name",
                    scope='usa',
                    title='Average fatality number caused by speeding in different states',
                    width=800,
                    height=400,
                    color_continuous_scale=px.colors.sequential.Tealgrn)
fig.show()

Observation: Texas and Mississipi have the highest fatality number considering speeding violations. The remaining sates that are not colored do not have fatality reported for speeding violations.

Conclusion:

Speeding causes the highest average fatality among all violations. The fatality rate due to speeding in rural area on national highway system, under daylight and in rainy days are way higher than other land use, light condition and atomospheric condition.

Recommendation:

Since Texas and Missisippi are states that cause the highest average fatality number by speeding, it is suggested that the state government to strictly enforcement all cars to install weather-proof tires and install more speed meters in in rural area on national highway system.

Goal 2

Distribution of drug or alcohol influenced accidents base on age and time

In [63]:
df_QT = df.loc[:,['state_name', 'consecutive_number','day_of_week', 'hour_of_crash',\
                 'drunk_drivers_num', 'age','alcohol', 'drug', 'fatalities_num','death']]
In [64]:
df_QT["death_bin"] = df_QT.death.apply(lambda x: 1 if 'At Scene' in x\
                         else 1 if 'En Route' in x else 0)
In [65]:
df_QT["alcohol_bin"] = df_QT.alcohol=="Yes"
In [66]:
df_QT["drug_bin"] = df_QT.drug=="Yes"
In [67]:
df_QT["Age_bin1"] = pd.cut(df_QT.age, bins=[0,28,45,65,85,120])
In [68]:
df_QT["Hours_bin"] = pd.cut(df_QT.hour_of_crash, bins=[0,6,12,18,24])

Alcohol influence in accidents and fatalities

In [69]:
sns.countplot(x = "alcohol_bin", data=df_QT).\
            set(xlabel = "Alcohol Consumption", ylabel = "Accidents",\
               title = "Distribution of Accicents by Alcohol Consumption")
Out[69]:
[Text(0, 0.5, 'Accidents'),
 Text(0.5, 0, 'Alcohol Consumption'),
 Text(0.5, 1.0, 'Distribution of Accicents by Alcohol Consumption')]
In [70]:
df_QT.groupby("alcohol_bin").agg({"fatalities_num":"sum","death_bin":"mean"}).\
rename(columns={ "fatalities_num": "Injured People","death_bin":"Average_death"})
Out[70]:
Injured People Average_death
alcohol_bin
False 89942 0.206594
True 10066 0.364939
In [71]:
sns.regplot(x="alcohol_bin", y= "death_bin", data = df_QT).\
            set(xlabel = "Alcohol Consumption", ylabel = "Death", title = "Effect of alcohol over death rate")
Out[71]:
[Text(0, 0.5, 'Death'),
 Text(0.5, 0, 'Alcohol Consumption'),
 Text(0.5, 1.0, 'Effect of alcohol over death rate')]

Observations: There are way fewer accidents caused by with alcohol influence than that without. However, the effect of alcohol influence is essential because death rate of an accident increases as the alcohol consumption increases.

Distribution of age and alcohol-influenced accidents

In [72]:
sns.catplot(x = "Age_bin1", kind = "count", \
            data=df_QT[df_QT.alcohol == "Yes"], aspect=2 ).\
            set(xlabel = "Age Group", ylabel = "Accidents")
Out[72]:
<seaborn.axisgrid.FacetGrid at 0x121de71c0>

Observations: There are more alcohol-influenced accidents for drivers between 0 to 28 years old.

In [ ]:
 

Distribution of alcohol influenced accidents by hours bin

In [73]:
sns.catplot(x = "alcohol_bin",hue = "Hours_bin",
            kind = "count", \
            data=df_QT[df_QT.alcohol == "Yes"], aspect=2 ).\
            set(xlabel = "Hours bin", ylabel = "Accidents count")
Out[73]:
<seaborn.axisgrid.FacetGrid at 0x1239b5730>

Distribution of alcohol influenced accidents by hours and weekend

In [74]:
df_QT['weekend']=df['day_of_week'].apply(lambda x: 1 if x=='Saturday' else 1 if x=='Sunday' else 0 )
In [75]:
sns.catplot(x = "alcohol_bin",hue = "Hours_bin", col="weekend",
            kind = "count", \
            data=df_QT[df_QT.alcohol == "Yes"], aspect=2 ).\
            set(xlabel = "Hours bin", ylabel = "Accidents count")
Out[75]:
<seaborn.axisgrid.FacetGrid at 0x11ebb7850>

Observations: There is no big difference in count in alcohol influenced accidents in weekend or weekday. However, there are more alcohol influenced accidents happened between 6pm and midnight on weekdays; as well as between midnight to 6am on weekends.

Drug influence in accidents and fatalities

In [76]:
df_QT.groupby(["drug_bin"])["death_bin"].mean()
Out[76]:
drug_bin
False    0.215730
True     0.354699
Name: death_bin, dtype: float64
In [77]:
sns.countplot(x = "drug_bin", data=df_QT).\
            set(xlabel = "Drug Consumption", ylabel = "Accidents",\
               title = "Distribution of Accicents by Drug Consumption")
Out[77]:
[Text(0, 0.5, 'Accidents'),
 Text(0.5, 0, 'Drug Consumption'),
 Text(0.5, 1.0, 'Distribution of Accicents by Drug Consumption')]
In [78]:
sns.regplot(x="drug_bin", y= "death_bin", data = df_QT).\
            set(xlabel = "Drug Consumption", ylabel = "Death", title = "Effect of drug over death rate")
Out[78]:
[Text(0, 0.5, 'Death'),
 Text(0.5, 0, 'Drug Consumption'),
 Text(0.5, 1.0, 'Effect of drug over death rate')]

Observations: There are way fewer accidents caused by with drug influence than that without. However, the effect of drug influence is essential because death rate of an accident increases as the drug consumption increases.

In [79]:
sns.catplot(x = "Age_bin1", kind = "count", data=df_QT[df_QT.drug_bin==1]).\
set(xlabel = "Age Group", ylabel = "Accidents")
Out[79]:
<seaborn.axisgrid.FacetGrid at 0x11f4538e0>

Observations: There are more drug-influenced accidents for drivers between 0 to 28 years old.

Distribution of drug influenced accidents by hours bin

In [80]:
sns.catplot(x = "drug_bin",hue = "Hours_bin",
            kind = "count", \
            data=df_QT[df_QT.drug == "Yes"], aspect=2 ).\
            set(xlabel = "Hours bin", ylabel = "Accidents count")
Out[80]:
<seaborn.axisgrid.FacetGrid at 0x11f5f9d60>

Distribution of drug influenced accidents by hours and weekend

In [81]:
sns.catplot(x = "drug_bin",hue = "Hours_bin", col="weekend",
            kind = "count", \
            data=df_QT[df_QT.drug == "Yes"], aspect=2 ).\
            set(xlabel = "Hours bin", ylabel = "Accidents count")
Out[81]:
<seaborn.axisgrid.FacetGrid at 0x1211e1e20>

Observations: There are way more drug influenced accidents on weekdays compare to weekends, and that from 12pm to midnight recorded to have the highest frequency of drug influenced accidents.

Probability of alcohol or drug involvement if there is an outcome of human death

In [82]:
df_QT.groupby(["alcohol_bin", "drug_bin"])["death_bin"].mean()
Out[82]:
alcohol_bin  drug_bin
False        False       0.203092
             True        0.306521
True         False       0.350058
             True        0.414682
Name: death_bin, dtype: float64

Conclusion:

  1. Age between 0 to 28 is the age group that have highest alcohol and drug influenced accidents
  2. There are way more alcohol influenced accidents from midnight to 6am on weekends compare to other times.
  3. There are way more drug influenced accidents from 12pm to midnight on weekdays compare to other times.

Recommendation:

  1. Increase the penalty of alcohol/ drug involvement accident, like revoking driving license for a year once found out alcohol/drug is involved.
  2. More police patrols around the above frequent drug and/or alchol influenced acccident times to spot for weird driving pattern cars on the road.

Goal 3

Distribution of injury severity with respect to manner of collision and the importance of safety measures to prevent death

In [83]:
df_pct=df[(df.Manner_collision!='Unknown') &\
             (df.first_crash_event!='Unknown') & (df.air_bag!='Unknown') &\
            (df.injury_severity!='Unknown') & (df.restraint_binary!='Unknown') & \
          ((df.violation_reduce=='SPEEDING') | (df.violation_reduce=='IMPAIRMENT OFFENSES')) & \
              (df.atmospheric_conditions=='Clear')]

Distribution of injury severity with respect to manner of collision

In [84]:
fr_df=df_pct.groupby('Manner_collision')['injury_severity'].value_counts().unstack()
fr_df.fillna(value=0,inplace=True)
fr_df
Out[84]:
injury_severity Death Injured Not_injured
Manner_collision
Multi_vehicle 120 518 236
Single_vehicle 397 521 285
In [85]:
ax=fr_df.plot(kind="barh", title='Distribution of injury severity on different manner of collision')
ax.set_xlabel('count')
Out[85]:
Text(0.5, 0, 'count')

Observation: Accidents involving single vehicle collisions have higher death count compare to that of multiple vehicle collisions.

Distribution of injury severity with respect to manner of collision and crash events </b>

In [86]:
fr_df2=df_pct.groupby(['Manner_collision','first_crash_event'])['injury_severity'].value_counts().unstack()
fr_df2.fillna(value=0, inplace=True)
fr_df2
Out[86]:
injury_severity Death Injured Not_injured
Manner_collision first_crash_event
Multi_vehicle Collision_with_moving_obj 120 518 236
Single_vehicle Collision_with_fix_obj 272 346 50
Collision_with_moving_obj 28 51 196
Non_collision 97 124 39
In [87]:
ax=fr_df2.plot(kind='barh', \
            title='Distribution of injury severity on different manner of collision and the crash events')
ax.set_xlabel('count')
Out[87]:
Text(0.5, 0, 'count')

Observation: Accidents that involve single vehicle and collision with fix object have the highest death count among all other combinations of vehicles and collision objects.

Distribution of injury severity with respect to Airbags

In [88]:
df_pct_air_bag=df_pct[(df_pct.Manner_collision=='Single_vehicle') ]
fr_df3=df_pct_air_bag.groupby('air_bag')['injury_severity'].value_counts().unstack()
fr_df3.fillna(value=0, inplace=True)
fr_df3
Out[88]:
injury_severity Death Injured Not_injured
air_bag
Deployed 291 369 101
Not Deployed 106 152 184
In [89]:
ax=fr_df3.plot(kind='bar', title='Distribution of injury severity on Air Bags',\
           rot='horizontal')
ax.set_ylabel('count')
Out[89]:
Text(0, 0.5, 'count')

Observation: We further dig into whether air bag was deployed with respect to injury severity. And we found out that, death and injury counts were still higher in accidents which air bag was deployed compare to that of not deployed.

Distribution of injury severity with respect to Airbags and Restraints </b>

In [90]:
fr_df6=df_pct_air_bag.groupby(['air_bag','restraint_binary'])['injury_severity'].value_counts().unstack()
fr_df6.fillna(value=0, inplace=True)
fr_df6
Out[90]:
injury_severity Death Injured Not_injured
air_bag restraint_binary
Deployed Restraint Used 89 196 80
Restraint not used 202 173 21
Not Deployed Restraint Used 24 87 165
Restraint not used 82 65 19
In [91]:
ax=fr_df6.plot(kind='barh', title='Distribution of injury severity on restraints and air bags')
ax.set_xlabel('count')
Out[91]:
Text(0.5, 0, 'count')

Observation: From the above graph, it shows 4 combinations between restraints being used and if the airbag deployed. By comparing the death count between restraints used and not used, we can that restraints do play a big role in prevent death in accidents. However, the people who involved in majority of the accidents that led to death did not use restraints.

In [92]:
df_pct_air_bag['Injured'] = df_pct_air_bag.injury_severity.apply(lambda x: \
                                                                 1 if x=='Death' \
                                                                 else 0.5 if x=='Injured' else 0 )
df_pct_air_bag['restraint_used'] = df_pct_air_bag.restraint_binary.apply(lambda x: \
                                                                         1 if x=='Restraint Used'\
                                                                         else 0 )
df_pct_air_bag['air_bag_deployed'] = df_pct_air_bag.air_bag.apply(lambda x: 1 if x=='Deployed' else 0 )
In [93]:
df_ml3 = df_pct_air_bag.loc[:,["Injured",'restraint_binary','air_bag','Manner_collision']]
df_ml3_1 = df_pct_air_bag.loc[:,["Injured",'restraint_used','air_bag_deployed']]
In [94]:
df_ml3_b = pd.get_dummies(columns=['restraint_binary',"air_bag","Manner_collision"], data =df_ml3 )
In [95]:
df_ml3_check = df_ml3_b.copy()
In [96]:
cor1 = df_ml3_check.corr()
In [97]:
cor1[cor1 != 1].stack().nlargest(20)[::2]
Out[97]:
restraint_binary_Restraint not used  Injured                                0.427425
Injured                              air_bag_Deployed                       0.274966
air_bag_Not Deployed                 restraint_binary_Restraint Used        0.139921
restraint_binary_Restraint not used  air_bag_Deployed                       0.139921
air_bag_Not Deployed                 restraint_binary_Restraint not used   -0.139921
restraint_binary_Restraint not used  air_bag_Not Deployed                  -0.139921
Injured                              air_bag_Not Deployed                  -0.274966
                                     restraint_binary_Restraint Used       -0.427425
air_bag_Deployed                     air_bag_Not Deployed                  -1.000000
restraint_binary_Restraint Used      restraint_binary_Restraint not used   -1.000000
dtype: float64
In [98]:
dt = tree.DecisionTreeRegressor(max_depth=2)
In [99]:
X = df_ml3_1.drop(columns=["Injured"])
In [100]:
Y = df_ml3_1.Injured
In [101]:
dt.fit(X,Y)

# This code will visualize a decision tree dt, trained with the attributes in X and the class labels in Y
dt_feature_names = list(X.columns)
dt_target_names = [str(s) for s in Y.unique()]
tree.export_graphviz(dt, out_file='tree.dot', 
    feature_names=dt_feature_names, class_names=dt_target_names,
    filled=True)  
graph = pydotplus.graph_from_dot_file('tree.dot')
Image(graph.create_png())
Out[101]:
DecisionTreeRegressor(max_depth=2)
Out[101]:

Proof of Machine Learning findings:

Left side of the tree (left result): Without the use of restraints and that the airbag is not deployed, the person has an expected value of 0.69. According to the lambda function of 0.5 representing injured and 1 representing death, the person is likely to be injured or death.

Right side of the tree (left result): With the use of restraints and that the airbag is not deployed, the person has an expected value of 0.245. According to the lambda function of 0 representing not injured and 0.5 representing injured, the person is unlikely to be injured.

In [102]:
ax=fr_df6.plot(kind='barh', title='Distribution of injury severity on restraints and air bags')
ax.set_xlabel('count')
Out[102]:
Text(0.5, 0, 'count')

Conclusion:

  1. Single vehicle colliding with fix object have the highest death count
  2. The people who involved in majority of the accidents that led to death did not use restraints.

Distribustion of death across state due to lack of restraints used during accidents</b>

In [103]:
df_restraint_notused=df_pct_air_bag[(df_pct_air_bag.restraint_binary=='Restraint not used') &\
                (df_pct_air_bag.injury_severity=="Death")].groupby('state_name')['injury_severity'].\
                value_counts().unstack().reset_index()
df_restraint_notused.sort_values(by='Death',ascending=False).head()
Out[103]:
injury_severity state_name Death
35 Texas 43
4 California 39
12 Illinois 37
25 North Carolina 21
40 Wisconsin 14
In [104]:
df_restraint_notused['state_name_abbr']=df_restraint_notused.state_name.apply(lambda x:us.states.lookup(x).abbr)


fig = px.choropleth(df_restraint_notused, 
                    locations="state_name_abbr",
                    locationmode='USA-states',
                    color="Death", 
                    hover_name="state_name",
                    scope='usa',
                    title='Death distribution due to lack of restraints across different states',
                    width=800,
                    height=400,
                    color_continuous_scale=px.colors.sequential.Tealgrn)
fig.show()

Observation: Texas, California and Illinois have the highest death count due to lack of restraints used during accidents. Other states that are not colored do not have any lack-of-restraints accidents reported.

Mapbox distribution for the top 3 states from the above result

In [105]:
df_state_Death=df_pct_air_bag[(df_pct_air_bag.restraint_binary=='Restraint not used') &\
                (df_pct_air_bag.injury_severity=="Death")].\
groupby(['state_name','latitude','longitude'])\
['injury_severity'].value_counts().unstack().reset_index()
In [106]:
highest_state_injury=df_restraint_notused['Death'].sort_values(ascending=False).nlargest(3).index.to_list()
highest_3_state_injury=df_restraint_notused.iloc[[35, 4, 12],:]
highest_3_state_injury=highest_3_state_injury.state_name.to_list()
In [107]:
df_pct_air_bag_1=df_pct_air_bag[(df_pct_air_bag.restraint_binary=='Restraint not used') &\
                (df_pct_air_bag.injury_severity=="Death")]
df_pct_air_bag_1=df_pct_air_bag_1[df_pct_air_bag_1.apply(lambda x : \
                                                         x['state_name'] in highest_3_state_injury, axis=1)]
In [108]:
lat_mean=df_pct_air_bag_1.latitude.mean()
long_mean=df_pct_air_bag_1.longitude.mean()
In [109]:
px.set_mapbox_access_token ='pk.eyJ1IjoieW9nYXIiLCJhIjoiY2tpNGMzZjF3MTZocTJ0bnp2am83bm00ZCJ9.wzkHbz3AU14X1sfpgXE6oA'
fig = px.scatter_mapbox(data_frame=df_pct_air_bag_1, 
                        lat="latitude", 
                        lon="longitude",
                        color="injury_severity",
                        color_continuous_scale=px.colors.cyclical.Edge_r, 
                       )
fig.update_layout(mapbox_style="carto-positron",\
                  mapbox_zoom=5, mapbox_center = {"lat": lat_mean, "lon": long_mean})

Observation: The above map showing the coordinates of distribution in top 3 states having highest death count due to lack of restraints.

Recommendation:

  1. Increase fine for not using restraints.
  2. In Texas, California and Illinois, where there are highest death count due to lack of restraints used during accidents, the state government should install cameras in areas of death pockets to possibly catch people who did not use restraints.